Loading and matching test and retest datasets

Load original data

test_data <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Complete_01-31-2017/variables_exhaustive.csv')

Load retest data

retest_data <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_02-11-2017/variables_exhaustive.csv')

Function to process lookup table

process_lookup <- function(lookup_json){
  lookup_json <- data.frame(unlist(lookup_json))
  lookup_json$sub_id <- as.character(row.names(lookup_json))
  row.names(lookup_json) <- seq(1:nrow(lookup_json))
  names(lookup_json)[1] <- 'worker_id'
  lookup_json$worker_id <- as.character(lookup_json$worker_id)
  lookup_json$complete <- ifelse(lookup_json$worker_id == lookup_json$sub_id, 0, 1)
  lookup_json <- lookup_json %>% arrange(-complete, sub_id)
  return(lookup_json)
}

Extract retest participants from test data

test_worker_lookup <- fromJSON('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_02-11-2017/Local/worker_lookup.json')
#Process full lookup tables for both datasets
test_worker_lookup <- process_lookup(test_worker_lookup)
#Process sub_id columns in the data dataframes
retest_data$X <- as.character(retest_data$X)
test_data$X <- as.character(test_data$X)
names(retest_data)[1] <- 'sub_id'
names(test_data)[1] <- 'sub_id'
#Trim lookup table to only include those we have data for
test_worker_lookup <- test_worker_lookup[test_worker_lookup$sub_id %in% test_data$sub_id,]
#Replace worker_id's in retest_data with original sub_id's 
correct_subids <- function(row, lookup_table){
  sub_id = row$sub_id
  if(sub_id %in% lookup_table$worker_id){
    index = which(sub_id == lookup_table$worker_id)
    sub_id = lookup_table$sub_id[index]
    row$sub_id = sub_id
  }
  return(row)
}
retest_data <- retest_data %>%
  group_by(sub_id) %>%
  do(correct_subids(.,test_worker_lookup))

Are all retest subjects in test_data? No.

sum(retest_data$sub_id %in% test_data$sub_id) == nrow(retest_data)
[1] FALSE

Who is missing?

retest_data$sub_id[which(retest_data$sub_id %in% test_data$sub_id == FALSE)]
[1] "s442"

One retest participant is not in test data. Remove that subject from retest_data for now

retest_data <- retest_data[retest_data$sub_id %in% test_data$sub_id,]
retest_data

Extract test data for retest subjects

retest_subs_test_data <- test_data[test_data$sub_id %in% retest_data$sub_id,]
rest_subs_test_data <- test_data[test_data$sub_id %in% retest_data$sub_id == FALSE,]
#Arrange datasets of same size by sub_id
retest_data = retest_data %>% arrange(sub_id)
retest_subs_test_data = retest_subs_test_data %>% arrange(sub_id)
#CHECK IF EVERYTHING IS ORDERED RIGHT
retest_subs_test_data$sub_id == retest_data$sub_id
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Drop columns from original data that are not in retest data

Are there any columns in retest data columns that are not in the original data: No.

names(retest_data)[names(retest_data) %in% names(retest_subs_test_data) == FALSE]
character(0)

Test data columns that are not in the retest data: Two stage hasn’t been fixed yet.

names(retest_subs_test_data)[names(retest_subs_test_data) %in% names(retest_data) == FALSE]
[1] "two_stage_decision.perseverance"

Datasets with only matching columns

all_columns <- unique(c(names(retest_data), names(retest_subs_test_data)))
matching_columns <- c()
for(i in 1:length(all_columns)){
  if(all_columns[i] %in% names(retest_data) & all_columns[i] %in% names(retest_subs_test_data)){
    matching_columns <- c(matching_columns, all_columns[i])
  }
}
retest_data <- retest_data[,matching_columns]
retest_subs_test_data <- retest_subs_test_data[,matching_columns]

Correlations of exhaustive variables

Get list of test retest correlations for all matching (numeric) columns in the test and retest data. Note two caveats: 1. There are still some columns that weren’t in both datasets (e.g. two stage) 2. Unlike meaningful variables these variables are not (yet) rid of outliers or log transformed if skewed. This will be detailed below along with its problems.

retest_data <- as.data.frame(retest_data)
retest_subs_test_data <- as.data.frame(retest_subs_test_data)
matching_dv_columns <- c()
for(i in 1:length(matching_columns)){
  if(is.numeric(retest_data[,matching_columns[i]]) & is.numeric(retest_subs_test_data[,matching_columns[i]])){
    matching_dv_columns <- c(matching_dv_columns, matching_columns[i])
  }
}
cor_df <- data.frame(pearson = rep(NA, length(matching_dv_columns)), spearman = rep(NA, length(matching_dv_columns)))
row.names(cor_df) <- matching_dv_columns
for(i in 1:length(matching_dv_columns)){
  cor_df[matching_dv_columns[i], 'pearson'] <- cor(retest_data[,matching_dv_columns[i]], retest_subs_test_data[,matching_dv_columns[i]], method = 'pearson', use = "pairwise.complete.obs")
  cor_df[matching_dv_columns[i], 'spearman'] <- cor(retest_data[,matching_dv_columns[i]],retest_subs_test_data[,matching_dv_columns[i]], method = 'spearman', use = "pairwise.complete.obs")
}

Distribution of all Pearson and Spearman correlations

Add column to tasks vs. surveys in correlation table

cor_df$dv = row.names(cor_df)
row.names(cor_df) = seq(1:nrow(cor_df))
cor_df$task = 'task'
cor_df[grep('survey', cor_df$dv), 'task'] = 'survey'

Distributions of correlations by task vs. survey

Survey reliabilities are noticably higher than task reliabilities.

Task reliability distributions looks alsmost bimodal. Should figure out whether the lower ones are specific tasks or if they are certain kinds of vars (e.g. certain ddm parameters)

cor_df %>%
  gather(key, value, -dv, -task) %>%
  ggplot(aes(value))+
  geom_histogram()+
  theme_bw()+
  facet_wrap(key~task)

Rank order correlations For tasks

cor_df %>%
  filter(task == 'task') %>%
  select(dv, spearman, pearson) %>%
  arrange(-pearson, -spearman)

Mean reliability for tasks

For surveys

Note: Holt and Laury test-retest is pretty terrible. In previous data it was ~.35. Should check to see if there is a coding error there.

cor_df %>%
  filter(task == 'survey') %>%
  select(dv, spearman, pearson) %>%
  arrange(-pearson, -spearman)

Mean reliability for surveys

Distribution of each variable

Check if the variables that went in to the correlation matrix have normal distributions since at least the Pearson correlations depend on this assumption. If not transform data columns in both retest and test data and calculate a new correlation df.

Function to check if the variables that went in to cor_df have normal distributions

diff_from_normal <- function(col){
  return(shapiro.test(col)$p.value < 0.05)
}

Check distributions of retest data

The smaller meaningful vars dataframe have been somewhat cleaned. So far in this notebook we looked at all the vars in their rawest form. Here we’ll first check the distributions of all the variables and then apply the same cleaning procedure that was used for the meaningful variables to all of them.

Note that I’m doing this for the sake of consistency for now. We might later decide that we would like to transform certain variables of interest in other ways to be able to include them in our analyses without violating the assumptions of those analyses. The current procedure checks the skewness of a column

Create and organize new df where distribution of vars data is stored. First for retest data only; then for test data for subjects who have retest data - restricting here since these are the subsets of data that went in to the correlation matrix. Merge and cleanup

dist_retest_data <- as.data.frame(apply(retest_data[,matching_dv_columns], 2, diff_from_normal))
names(dist_retest_data) <- c('retest_not_normal')
dist_retest_data$dv <- row.names(dist_retest_data)
row.names(dist_retest_data) <- seq(1:nrow(dist_retest_data))
dist_retest_data = dist_retest_data %>% select(dv, retest_not_normal)
dist_retest_subs_test_data <- as.data.frame(apply(retest_subs_test_data[,matching_dv_columns], 2, diff_from_normal))
names(dist_retest_subs_test_data) <- c('test_not_normal')
dist_retest_subs_test_data$dv <- row.names(dist_retest_subs_test_data)
row.names(dist_retest_subs_test_data) <- seq(1:nrow(dist_retest_subs_test_data))
dist_retest_subs_test_data = dist_retest_subs_test_data %>% select(dv, test_not_normal)
dist_data <- merge(dist_retest_data, dist_retest_subs_test_data, by = 'dv')
rm(dist_retest_data, dist_retest_subs_test_data)

Add whether dv’s are tasks or survey to distribution dfs

cor_df %>%
  select(dv, task) %>%
  left_join(dist_data, by = 'dv')

What proportion is not normal? A large proportion of the variables are not normal for data from both sources. Larger proportions of survey data are not normal compared to task data.

Are the same vars that are not normally distributed in the retest data not normal in test data as well? List of variables that have different distributions in the two time points (shown here is whether they are normally distributed in the retest data) A majority of them are drift diffusion parameters (but note that these vars are overrepresented to begin with - more on them later).

Apply the same cleaning procedure that was used for meaningful vars for all vars:

transform_remove_skew = function(data, columns, threshold = 1){
  
  tmp = as.data.frame(apply(data[,columns],2,skew))
  names(tmp) = c("skew")
  tmp$dv = row.names(tmp)
  tmp = tmp %>% 
    filter(abs(skew)>threshold)
  
  skewed_variables = tmp$dv
  skew_subset = data[, skewed_variables]
  positive_subset = data[,tmp$dv[tmp$skew>0]]
  negative_subset = data[,tmp$dv[tmp$skew<0]]
  
  # transform variables
  # log transform for positive skew
  positive_subset = log(positive_subset)
  successful_transforms = as.data.frame(apply(positive_subset, 2, skew))
  names(successful_transforms) = c('skew')
  successful_transforms$dv = row.names(successful_transforms)
  successful_transforms = successful_transforms %>% filter(abs(skew)<threshold)
  successful_transforms = successful_transforms$dv
  successful_transforms = positive_subset[,successful_transforms]
  dropped_vars = names(positive_subset) %w/o% names(successful_transforms)
  
  # replace transformed variables
  data = data[,-c(which(names(data) %in% names(positive_subset)))]
  names(successful_transforms) = paste0(names(successful_transforms), '.logTr')
  print.noquote(rep('*', 40))
  print(paste0('Dropping ', length(dropped_vars) ,' positively skewed data that could not be transformed successfully:'))
  print(dropped_vars)
  print.noquote(rep('*', 40))
  data = cbind(data, successful_transforms)
  
  # reflected log transform for negative skew      
  negative_subset = as.data.frame(apply(negative_subset, 2, neg_log))
  successful_transforms = as.data.frame(apply(negative_subset, 2, skew))
  names(successful_transforms) = c('skew')
  successful_transforms$dv = row.names(successful_transforms)
  successful_transforms = successful_transforms %>% filter(abs(skew)<1)
  successful_transforms = successful_transforms$dv
  successful_transforms = negative_subset[,successful_transforms]
  dropped_vars = names(negative_subset) %w/o% names(successful_transforms)
  
  # replace transformed variables
  data = data[,-c(which(names(data) %in% names(negative_subset)))]
  names(successful_transforms) = paste0(names(successful_transforms), '.ReflogTr')
  print.noquote(rep('*', 40))
  print(paste0('Dropping ', length(dropped_vars) ,' negatively skewed data that could not be transformed successfully:'))
  print(dropped_vars)
  print.noquote(rep('*', 40))
  data = cbind(data, successful_transforms)
  
  return(data)
}
transform_remove_skew(retest_data, matching_dv_columns)
NaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs produced
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[1] "Dropping 39 positively skewed data that could not be transformed successfully:"
 [1] "adaptive_n_back.missed_percent"                            
 [2] "attention_network_task.avg_rt_error"                       
 [3] "attention_network_task.missed_percent"                     
 [4] "bickel_titrator.hyp_discount_rate_large"                   
 [5] "choice_reaction_time.avg_rt_error"                         
 [6] "choice_reaction_time.missed_percent"                       
 [7] "choice_reaction_time.std_rt_error"                         
 [8] "demographics_survey.arrest_count"                          
 [9] "demographics_survey.caffeine_intake"                       
[10] "demographics_survey.car_debt"                              
[11] "demographics_survey.divoce_count"                          
[12] "demographics_survey.longest_relationship.months."          
[13] "demographics_survey.mortage_debt"                          
[14] "demographics_survey.other_sources_of_debt"                 
[15] "demographics_survey.traffic_ticket_count"                  
[16] "dickman_survey.dysfunctional"                              
[17] "dietary_decision.health_sensitivity"                       
[18] "directed_forgetting.missed_percent"                        
[19] "discount_titrate.hyp_discount_rate_nm"                     
[20] "dot_pattern_expectancy.EZ_non_decision_BX"                 
[21] "dot_pattern_expectancy.missed_percent"                     
[22] "hierarchical_rule.missed_percent"                          
[23] "holt_laury_survey.number_of_switches"                      
[24] "impulsive_venture_survey.impulsiveness"                    
[25] "local_global_letter.missed_percent"                        
[26] "local_global_letter.std_rt_error"                          
[27] "probabilistic_selection.missed_percent"                    
[28] "psychological_refractory_period_two_choices.missed_percent"
[29] "recent_probes.missed_percent"                              
[30] "shape_matching.missed_percent"                             
[31] "shift_task.missed_percent"                                 
[32] "simon.missed_percent"                                      
[33] "simple_reaction_time.missed_percent"                       
[34] "stop_signal.proactive_SSRT_speeding"                       
[35] "stroop.missed_percent"                                     
[36] "stroop.post_error_slowing"                                 
[37] "threebytwo.cue_switch_cost_rt_100.0"                       
[38] "threebytwo.missed_percent"                                 
[39] "two_stage_decision.missed_percent"                         
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[1] "Dropping 35 negatively skewed data that could not be transformed successfully:"
 [1] "attention_network_task.acc"                           
 [2] "attention_network_task.conflict_acc"                  
 [3] "attention_network_task.orienting_acc"                 
 [4] "choice_reaction_time.acc"                             
 [5] "columbia_card_task_cold.loss_sensitivity"             
 [6] "directed_forgetting.acc"                              
 [7] "discount_titrate.hyp_discount_rate_glm"               
 [8] "discount_titrate.hyp_discount_rate_glm_notnow"        
 [9] "discount_titrate.hyp_discount_rate_glm_now"           
[10] "dot_pattern_expectancy.AY.BY_acc"                     
[11] "dot_pattern_expectancy.BX.BY_EZ_drift"                
[12] "dot_pattern_expectancy.BX.BY_acc"                     
[13] "dot_pattern_expectancy.BX.BY_rt"                      
[14] "dot_pattern_expectancy.acc"                           
[15] "go_nogo.acc"                                          
[16] "go_nogo.go_acc"                                       
[17] "information_sampling_task.Fixed_Win_acc"              
[18] "local_global_letter.acc"                              
[19] "motor_selective_stop_signal.go_acc"                   
[20] "motor_selective_stop_signal.stop_acc"                 
[21] "psychological_refractory_period_two_choices.task1_acc"
[22] "psychological_refractory_period_two_choices.task2_acc"
[23] "recent_probes.acc"                                    
[24] "shape_matching.EZ_non_decision_SSS"                   
[25] "shape_matching.acc"                                   
[26] "shape_matching.hddm_non_decision"                     
[27] "simon.acc"                                            
[28] "simon.hddm_non_decision"                              
[29] "stim_selective_stop_signal.go_acc"                    
[30] "stim_selective_stop_signal.stop_acc"                  
[31] "stop_signal.go_acc"                                   
[32] "stop_signal.stop_acc"                                 
[33] "stroop.acc"                                           
[34] "stroop.stroop_acc"                                    
[35] "threebytwo.acc"                                       
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
clean_retest_data = transform_remove_skew(clean_retest_data, matching_dv_columns)
NaNs produced
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[39] * *
[1] "Dropping 18 positively skewed data that could not be transformed successfully:"
 [1] "adaptive_n_back.missed_percent"                            
 [2] "attention_network_task.missed_percent"                     
 [3] "bickel_titrator.hyp_discount_rate_small"                   
 [4] "demographics_survey.caffeine_intake"                       
 [5] "demographics_survey.car_debt"                              
 [6] "demographics_survey.mortage_debt"                          
 [7] "directed_forgetting.missed_percent"                        
 [8] "dot_pattern_expectancy.missed_percent"                     
 [9] "hierarchical_rule.missed_percent"                          
[10] "impulsive_venture_survey.impulsiveness"                    
[11] "local_global_letter.missed_percent"                        
[12] "probabilistic_selection.missed_percent"                    
[13] "psychological_refractory_period_two_choices.missed_percent"
[14] "shape_matching.missed_percent"                             
[15] "shift_task.missed_percent"                                 
[16] "stroop.missed_percent"                                     
[17] "threebytwo.missed_percent"                                 
[18] "two_stage_decision.missed_percent"                         
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[39] * *
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[39] * *
[1] "Dropping 7 negatively skewed data that could not be transformed successfully:"
[1] "attention_network_task.acc"                           
[2] "choice_reaction_time.acc"                             
[3] "directed_forgetting.acc"                              
[4] "go_nogo.go_acc"                                       
[5] "motor_selective_stop_signal.stop_acc"                 
[6] "psychological_refractory_period_two_choices.task1_acc"
[7] "psychological_refractory_period_two_choices.task2_acc"
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[39] * *

I’m not sure what is the right thing to do regarding the cleaning of the test data for the retest subjects. Should they be cleaned on the whole sample for the time they were collected on and then extracted (this should have an effect on outliers as well as transformations) or extracted and then cleaned. Here I am doing the latter but this seems to be resulting in more skewed variables.

clean_retest_subs_test_data = transform_remove_skew(retest_subs_test_data, matching_dv_columns)
NaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs produced
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[39] * *
[1] "Dropping 42 positively skewed data that could not be transformed successfully:"
 [1] "adaptive_n_back.missed_percent"                            
 [2] "adaptive_n_back.std_rt"                                    
 [3] "attention_network_task.avg_rt_error"                       
 [4] "attention_network_task.missed_percent"                     
 [5] "choice_reaction_time.missed_percent"                       
 [6] "choice_reaction_time.std_rt_error"                         
 [7] "columbia_card_task_cold.gain_sensitivity"                  
 [8] "demographics_survey.arrest_count"                          
 [9] "demographics_survey.caffeine_intake"                       
[10] "demographics_survey.car_debt"                              
[11] "demographics_survey.divoce_count"                          
[12] "demographics_survey.household_income.dollars."             
[13] "demographics_survey.longest_relationship.months."          
[14] "demographics_survey.mortage_debt"                          
[15] "demographics_survey.other_sources_of_debt"                 
[16] "demographics_survey.percent_retirement_in_stock"           
[17] "demographics_survey.traffic_ticket_count"                  
[18] "dickman_survey.dysfunctional"                              
[19] "directed_forgetting.missed_percent"                        
[20] "directed_forgetting.proactive_interference_acc"            
[21] "dot_pattern_expectancy.missed_percent"                     
[22] "dot_pattern_expectancy.std_rt"                             
[23] "dot_pattern_expectancy.std_rt_error"                       
[24] "hierarchical_rule.missed_percent"                          
[25] "holt_laury_survey.number_of_switches"                      
[26] "impulsive_venture_survey.impulsiveness"                    
[27] "local_global_letter.missed_percent"                        
[28] "local_global_letter.switch_cost_rt"                        
[29] "probabilistic_selection.missed_percent"                    
[30] "psychological_refractory_period_two_choices.missed_percent"
[31] "recent_probes.missed_percent"                              
[32] "shape_matching.missed_percent"                             
[33] "shift_task.missed_percent"                                 
[34] "simon.missed_percent"                                      
[35] "simple_reaction_time.missed_percent"                       
[36] "stim_selective_stop_signal.go_rt_std"                      
[37] "stim_selective_stop_signal.stop_rt_error_std"              
[38] "stroop.missed_percent"                                     
[39] "threebytwo.cue_switch_cost_rt_100.0"                       
[40] "threebytwo.missed_percent"                                 
[41] "threebytwo.task_inhibition_900.0"                          
[42] "two_stage_decision.missed_percent"                         
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[39] * *
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[39] * *
[1] "Dropping 37 negatively skewed data that could not be transformed successfully:"
 [1] "adaptive_n_back.acc"                                  
 [2] "attention_network_task.acc"                           
 [3] "attention_network_task.conflict_acc"                  
 [4] "attention_network_task.hddm_non_decision"             
 [5] "choice_reaction_time.acc"                             
 [6] "columbia_card_task_cold.loss_sensitivity"             
 [7] "columbia_card_task_hot.loss_sensitivity"              
 [8] "directed_forgetting.acc"                              
 [9] "directed_forgetting.proactive_interference_rt"        
[10] "dot_pattern_expectancy.BX.BY_acc"                     
[11] "dot_pattern_expectancy.BX.BY_rt"                      
[12] "dot_pattern_expectancy.EZ_non_decision_AX"            
[13] "dot_pattern_expectancy.acc"                           
[14] "go_nogo.acc"                                          
[15] "go_nogo.go_acc"                                       
[16] "hierarchical_rule.post_error_slowing"                 
[17] "information_sampling_task.Fixed_Win_acc"              
[18] "motor_selective_stop_signal.go_acc"                   
[19] "motor_selective_stop_signal.stop_acc"                 
[20] "probabilistic_selection.positive_learning_bias"       
[21] "psychological_refractory_period_two_choices.task1_acc"
[22] "psychological_refractory_period_two_choices.task2_acc"
[23] "recent_probes.acc"                                    
[24] "shape_matching.acc"                                   
[25] "shape_matching.hddm_non_decision"                     
[26] "simon.acc"                                            
[27] "simon.post_error_slowing"                             
[28] "simon.simon_acc"                                      
[29] "stim_selective_stop_signal.EZ_non_decision"           
[30] "stim_selective_stop_signal.go_acc"                    
[31] "stim_selective_stop_signal.hddm_non_decision"         
[32] "stop_signal.go_acc"                                   
[33] "stop_signal.go_rt_error"                              
[34] "stop_signal.hddm_non_decision"                        
[35] "stroop.acc"                                           
[36] "stroop.stroop_acc"                                    
[37] "threebytwo.acc"                                       
 [1] * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
[39] * *

But what about dropping different variables from the two dataframes

Drift diffusion variables: raw vars vs. EZ vs. hddm

Mean reliabilities for different kinds of variables (e.g. drift rates and NDs; separately for EZ and HDDM; subtraction vs basic variables - single vs. multiple)


Can you get the factor structure for the retest step 1:Ttest on all measures of T1 for people w and without retest

step2 : efa on the people you don’t have retest on and then predict how well this holds up for people w retest on t1 and on t2

Data cleaning for efa

#read in only meaningful vars
meaningful_var_test <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Complete_01-31-2017/meaningful_variables_clean.csv')

meaningful_var_retest <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_02-11-2017/meaningful_variables_clean.csv')

#Process sub_id columns in the data dataframes
meaningful_var_retest$X <- as.character(meaningful_var_retest$X)
meaningful_var_test$X <- as.character(meaningful_var_test$X)

names(meaningful_var_retest)[1] <- 'sub_id'
names(meaningful_var_test)[1] <- 'sub_id'

#Clean and subset dfs as before
meaningful_var_retest <- meaningful_var_retest %>%
  group_by(sub_id) %>%
  do(correct_subids(.,test_worker_lookup))

#Are all retest subjects in test_data? No. 
sum(meaningful_var_retest$sub_id %in% meaningful_var_test$sub_id) == nrow(meaningful_var_retest)

#Who is missing?
meaningful_var_retest$sub_id[which(meaningful_var_retest$sub_id %in% meaningful_var_test$sub_id == FALSE)]

#Remove sub w missing test data from retest data
retest_data_mngf <- meaningful_var_retest[meaningful_var_retest$sub_id %in% meaningful_var_test$sub_id,]

#Extract test data for subs w retest data
retest_subs_test_mngf <- meaningful_var_test[meaningful_var_test$sub_id %in% meaningful_var_retest$sub_id,]

#Extract test data for subs without retest data (efa df)
rest_subs_test_mngf <- meaningful_var_test[meaningful_var_test$sub_id %in% meaningful_var_retest$sub_id == FALSE,]

#Arrange datasets of same size by sub_id
retest_data_mngf = retest_data_mngf %>% arrange(sub_id)
retest_subs_test_mngf = retest_subs_test_mngf %>% arrange(sub_id)

#CHECK IF EVERYTHING IS ORDERED RIGHT
retest_subs_test_mngf$sub_id == retest_data_mngf$sub_id

Checking columns for efa: NEED TO RESOLVE SOME DIFFERENCES IN MEANINGFUL VARS BETWEEN THE TWO DATASETS

#Check columns 
names(retest_data_mngf)[names(retest_data_mngf) %in% names(retest_subs_test_mngf) == FALSE]

names(retest_subs_test_mngf)[names(retest_subs_test_mngf) %in% names(retest_data_mngf) == FALSE]

#Subset datasets with only matching columns
all_columns_mngf <- unique(c(names(retest_data_mngf), names(retest_subs_test_mngf)))

matching_columns_mngf <- c()
for(i in 1:length(all_columns_mngf)){
  if(all_columns_mngf[i] %in% names(retest_data_mngf) & all_columns_mngf[i] %in% names(retest_subs_test_mngf)){
    matching_columns_mngf <- c(matching_columns_mngf, all_columns_mngf[i])
  }
}

retest_data_mngf <- retest_data_mngf[,matching_columns_mngf]
retest_subs_test_mngf <- retest_subs_test_mngf[,matching_columns_mngf]
rest_subs_test_mngf <- rest_subs_test_mngf[,matching_columns_mngf]

#standardize all vars going in to factor analysis
rest_subs_test_mngf_z <- rest_subs_test_mngf

for(i in 1:length(rest_subs_test_mngf)){
  if(names(rest_subs_test_mngf)[i] == 'sub_id'){
    rest_subs_test_mngf_z$sub_id = rest_subs_test_mngf$sub_id
  }
  else{
    rest_subs_test_mngf_z[,names(rest_subs_test_mngf)[i]] = scale(rest_subs_test_mngf[,i])
  }
}

rest_subs_test_mngf_z <- as.data.frame(rest_subs_test_mngf_z)

Efa on subjects from original dataset withOUT retest data (using meaningful vars only)

unrotated <- efaUnrotate(rest_subs_test_mngf_z, nf=3, varList=names(rest_subs_test_mngf_z)[-1], estimator="mlr")

summary(unrotated, std=TRUE)
inspect(unrotated, "std")
corMat <- rest_subs_test_mngf_z %>% select(-sub_id) %>% cor(.,use = 'complete.obs')

solution <- fa(r = corMat, nfactors = 3, rotate = "oblimin", fm = "pa")
solution

Completion times/days

T1 comparison for people who came back vs didn’t (of all that are invited)

---
title: 'Self Regulation Ontology Retest Data: Initial Exploration'
output:
  html_notebook: default
  pdf_document: default
  html_document:
    toc: true
    toc_depth: 2
    toc_float: true
---

```{r, message=FALSE, warning=FALSE, include=FALSE}
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(lme4)
library(GGally)
library(jsonlite)
library(lavaan)
library(semTools)
library(psych)
library(GPArotation)
sem <- function(x) {sd(x) / sqrt(length(x))}
options(digits = 4)
```

# Loading and matching test and retest datasets

Load original data
```{r}
test_data <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Complete_01-31-2017/variables_exhaustive.csv')
```

Load retest data
```{r}
retest_data <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_02-11-2017/variables_exhaustive.csv')
```

Function to process lookup table
```{r}
process_lookup <- function(lookup_json){
  lookup_json <- data.frame(unlist(lookup_json))
  lookup_json$sub_id <- as.character(row.names(lookup_json))
  row.names(lookup_json) <- seq(1:nrow(lookup_json))
  names(lookup_json)[1] <- 'worker_id'
  lookup_json$worker_id <- as.character(lookup_json$worker_id)
  lookup_json$complete <- ifelse(lookup_json$worker_id == lookup_json$sub_id, 0, 1)
  lookup_json <- lookup_json %>% arrange(-complete, sub_id)
  return(lookup_json)
}
```

Extract retest participants from test data

```{r}
test_worker_lookup <- fromJSON('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_02-11-2017/Local/worker_lookup.json')

#Process full lookup tables for both datasets
test_worker_lookup <- process_lookup(test_worker_lookup)

#Process sub_id columns in the data dataframes
retest_data$X <- as.character(retest_data$X)
test_data$X <- as.character(test_data$X)

names(retest_data)[1] <- 'sub_id'
names(test_data)[1] <- 'sub_id'

#Trim lookup table to only include those we have data for
test_worker_lookup <- test_worker_lookup[test_worker_lookup$sub_id %in% test_data$sub_id,]

#Replace worker_id's in retest_data with original sub_id's 
correct_subids <- function(row, lookup_table){
  sub_id = row$sub_id
  if(sub_id %in% lookup_table$worker_id){
    index = which(sub_id == lookup_table$worker_id)
    sub_id = lookup_table$sub_id[index]
    row$sub_id = sub_id
  }
  return(row)
}

retest_data <- retest_data %>%
  group_by(sub_id) %>%
  do(correct_subids(.,test_worker_lookup))
```

Are all retest subjects in test_data? No. 
```{r}
sum(retest_data$sub_id %in% test_data$sub_id) == nrow(retest_data)
```

Who is missing?
```{r}
retest_data$sub_id[which(retest_data$sub_id %in% test_data$sub_id == FALSE)]
```

One retest participant is not in test data. Remove that subject from retest_data for now
```{r}
retest_data <- retest_data[retest_data$sub_id %in% test_data$sub_id,]

retest_data
```

Extract test data for retest subjects
```{r}
retest_subs_test_data <- test_data[test_data$sub_id %in% retest_data$sub_id,]

rest_subs_test_data <- test_data[test_data$sub_id %in% retest_data$sub_id == FALSE,]

#Arrange datasets of same size by sub_id
retest_data = retest_data %>% arrange(sub_id)
retest_subs_test_data = retest_subs_test_data %>% arrange(sub_id)

#CHECK IF EVERYTHING IS ORDERED RIGHT
retest_subs_test_data$sub_id == retest_data$sub_id
```

Drop columns from original data that are not in retest data

Are there any columns in retest data columns that are not in the original data: No.

```{r}
names(retest_data)[names(retest_data) %in% names(retest_subs_test_data) == FALSE]
```

Test data columns that are not in the retest data: Two stage hasn't been fixed yet.

```{r}
names(retest_subs_test_data)[names(retest_subs_test_data) %in% names(retest_data) == FALSE]
```

Datasets with only matching columns

```{r}
all_columns <- unique(c(names(retest_data), names(retest_subs_test_data)))

matching_columns <- c()
for(i in 1:length(all_columns)){
  if(all_columns[i] %in% names(retest_data) & all_columns[i] %in% names(retest_subs_test_data)){
    matching_columns <- c(matching_columns, all_columns[i])
  }
}

retest_data <- retest_data[,matching_columns]
retest_subs_test_data <- retest_subs_test_data[,matching_columns]
```

# Correlations of exhaustive variables

Get list of test retest correlations for all matching (numeric) columns in the test and retest data. Note two caveats:
1. There are still some columns that weren't in both datasets (e.g. two stage)
2. Unlike meaningful variables these variables are not (yet) rid of outliers or log transformed if skewed. This will be detailed below along with its problems.

```{r}
retest_data <- as.data.frame(retest_data)
retest_subs_test_data <- as.data.frame(retest_subs_test_data)

matching_dv_columns <- c()

for(i in 1:length(matching_columns)){
  if(is.numeric(retest_data[,matching_columns[i]]) & is.numeric(retest_subs_test_data[,matching_columns[i]])){
    matching_dv_columns <- c(matching_dv_columns, matching_columns[i])
  }
}

cor_df <- data.frame(pearson = rep(NA, length(matching_dv_columns)), spearman = rep(NA, length(matching_dv_columns)))

row.names(cor_df) <- matching_dv_columns

for(i in 1:length(matching_dv_columns)){
  cor_df[matching_dv_columns[i], 'pearson'] <- cor(retest_data[,matching_dv_columns[i]], retest_subs_test_data[,matching_dv_columns[i]], method = 'pearson', use = "pairwise.complete.obs")
  cor_df[matching_dv_columns[i], 'spearman'] <- cor(retest_data[,matching_dv_columns[i]],retest_subs_test_data[,matching_dv_columns[i]], method = 'spearman', use = "pairwise.complete.obs")
}
```

Distribution of all Pearson and Spearman correlations
```{r}
cor_df %>%
  gather(key, value) %>%
  ggplot(aes(value))+
  geom_histogram()+
  theme_bw()+
  facet_wrap(~key)
```

Add column to tasks vs. surveys in correlation table

```{r}
cor_df$dv = row.names(cor_df)
row.names(cor_df) = seq(1:nrow(cor_df))
cor_df$task = 'task'
cor_df[grep('survey', cor_df$dv), 'task'] = 'survey'
```

Distributions of correlations by task vs. survey

Survey reliabilities are noticably higher than task reliabilities.

Task reliability distributions looks alsmost bimodal. **Should figure out whether the lower ones are specific tasks or if they are certain kinds of vars (e.g. certain ddm parameters)**
```{r}
cor_df %>%
  gather(key, value, -dv, -task) %>%
  ggplot(aes(value))+
  geom_histogram()+
  theme_bw()+
  facet_wrap(key~task)
```

Rank order correlations
For tasks
```{r}
cor_df %>%
  filter(task == 'task') %>%
  select(dv, spearman, pearson) %>%
  arrange(-pearson, -spearman)
```

Mean reliability for tasks
```{r}
cor_df %>%
  filter(task == 'task') %>%
  select(dv, spearman, pearson) %>%
  arrange(-pearson, -spearman) %>%
  summarise(mean_pearson = mean(pearson),
            mean_spearman = mean(spearman),
            num_vars = n())
```

For surveys

Note: Holt and Laury test-retest is pretty terrible. In previous data it was ~.35. Should check to see if there is a coding error there.
```{r}
cor_df %>%
  filter(task == 'survey') %>%
  select(dv, spearman, pearson) %>%
  arrange(-pearson, -spearman)
```

Mean reliability for surveys
```{r}
cor_df %>%
  filter(task == 'survey') %>%
  select(dv, spearman, pearson) %>%
  arrange(-pearson, -spearman) %>%
  summarise(mean_pearson = mean(pearson),
            mean_spearman = mean(spearman),
            num_vars = n())
```

Distribution of each variable

Check if the variables that went in to the correlation matrix have normal distributions since at least the Pearson correlations depend on this assumption. If not transform data columns in both retest and test data and calculate a new correlation df.

Function to check if the variables that went in to cor_df have normal distributions
```{r}
diff_from_normal <- function(col){
  return(shapiro.test(col)$p.value < 0.05)
}
```

Check distributions of retest data

The smaller meaningful vars dataframe have been somewhat cleaned. So far in this notebook we looked at all the vars in their rawest form. Here we'll first check the distributions of all the variables and then apply the same cleaning procedure that was used for the meaningful variables to all of them. 

Note that I'm doing this for the sake of consistency for now. We might later decide that we would like to transform certain variables of interest in other ways to be able to include them in our analyses without violating the assumptions of those analyses. The current procedure checks the skewness of a column

Create and organize new df where distribution of vars data is stored. First for retest data only; then for test data for subjects who have retest data - restricting here since these are the subsets of data that went in to the correlation matrix. Merge and cleanup
```{r}
dist_retest_data <- as.data.frame(apply(retest_data[,matching_dv_columns], 2, diff_from_normal))
names(dist_retest_data) <- c('retest_not_normal')
dist_retest_data$dv <- row.names(dist_retest_data)
row.names(dist_retest_data) <- seq(1:nrow(dist_retest_data))
dist_retest_data = dist_retest_data %>% select(dv, retest_not_normal)

dist_retest_subs_test_data <- as.data.frame(apply(retest_subs_test_data[,matching_dv_columns], 2, diff_from_normal))
names(dist_retest_subs_test_data) <- c('test_not_normal')
dist_retest_subs_test_data$dv <- row.names(dist_retest_subs_test_data)
row.names(dist_retest_subs_test_data) <- seq(1:nrow(dist_retest_subs_test_data))
dist_retest_subs_test_data = dist_retest_subs_test_data %>% select(dv, test_not_normal)

dist_data <- merge(dist_retest_data, dist_retest_subs_test_data, by = 'dv')

rm(dist_retest_data, dist_retest_subs_test_data)
```

Add whether dv's are tasks or survey to distribution dfs
```{r}
dist_data = cor_df %>%
  select(dv, task) %>%
  left_join(dist_data, by = 'dv')
```

What proportion is not normal?
A large proportion of the variables are not normal for data from both sources. Larger proportions of survey data are not normal compared to task data. 
```{r}
dist_data %>%
  group_by(task) %>%
  summarise(mean_retest_not_normal  = mean(retest_not_normal),
            mean_test_not_normal = mean(test_not_normal))
```

Are the same vars that are not normally distributed in the retest data not normal in test data as well?
List of variables that have different distributions in the two time points (shown here is whether they are normally distributed in the retest data)
A majority of them are drift diffusion parameters (but note that these vars are overrepresented to begin with - more on them later).
```{r}
dist_data %>%
  filter(retest_not_normal != test_not_normal)
```

Apply the same cleaning procedure that was used for meaningful vars for all vars:
```{r}
remove_outliers = function(data_column, quantile_range = 2.5){
  
  q_25 = quantile(data_column, na.rm=T)[2]
  q_50 = quantile(data_column, na.rm=T)[3]
  q_75 = quantile(data_column, na.rm=T)[4]
  
  lowlimit = q_50 - quantile_range*(q_75 - q_25)
  highlimit = q_50 + quantile_range*(q_75 - q_25)
  
  data_column = ifelse(data_column<lowlimit, NA, ifelse(data_column>highlimit, NA, data_column))
  
  return(data_column)
}

"%w/o%" <- function(x, y) x[!x %in% y]

neg_log <- function(column){
  col_max = max(column, na.rm=T)
  column = col_max+1-column
  return(log(column))
}

transform_remove_skew = function(data, columns, threshold = 1){
  
  tmp = as.data.frame(apply(data[,columns],2,skew))
  names(tmp) = c("skew")
  tmp$dv = row.names(tmp)
  tmp = tmp %>% 
    filter(abs(skew)>threshold)
  
  skewed_variables = tmp$dv
  skew_subset = data[, skewed_variables]
  positive_subset = data[,tmp$dv[tmp$skew>0]]
  negative_subset = data[,tmp$dv[tmp$skew<0]]
  
  # transform variables
  # log transform for positive skew
  positive_subset = log(positive_subset)
  successful_transforms = as.data.frame(apply(positive_subset, 2, skew))
  names(successful_transforms) = c('skew')
  successful_transforms$dv = row.names(successful_transforms)
  successful_transforms = successful_transforms %>% filter(abs(skew)<threshold)
  successful_transforms = successful_transforms$dv
  successful_transforms = positive_subset[,successful_transforms]
  dropped_vars = names(positive_subset) %w/o% names(successful_transforms)
  
  # replace transformed variables
  data = data[,-c(which(names(data) %in% names(positive_subset)))]
  names(successful_transforms) = paste0(names(successful_transforms), '.logTr')
  print.noquote(rep('*', 40))
  print(paste0('Dropping ', length(dropped_vars) ,' positively skewed data that could not be transformed successfully:'))
  print(dropped_vars)
  print.noquote(rep('*', 40))
  data = cbind(data, successful_transforms)
  
  # reflected log transform for negative skew      
  negative_subset = as.data.frame(apply(negative_subset, 2, neg_log))
  successful_transforms = as.data.frame(apply(negative_subset, 2, skew))
  names(successful_transforms) = c('skew')
  successful_transforms$dv = row.names(successful_transforms)
  successful_transforms = successful_transforms %>% filter(abs(skew)<1)
  successful_transforms = successful_transforms$dv
  successful_transforms = negative_subset[,successful_transforms]
  dropped_vars = names(negative_subset) %w/o% names(successful_transforms)
  
  # replace transformed variables
  data = data[,-c(which(names(data) %in% names(negative_subset)))]
  names(successful_transforms) = paste0(names(successful_transforms), '.ReflogTr')
  print.noquote(rep('*', 40))
  print(paste0('Dropping ', length(dropped_vars) ,' negatively skewed data that could not be transformed successfully:'))
  print(dropped_vars)
  print.noquote(rep('*', 40))
  data = cbind(data, successful_transforms)
  
  return(data)
}
```

```{r}
clean_retest_data = as.data.frame(apply(retest_data[, matching_dv_columns], 2, remove_outliers))
clean_retest_data = transform_remove_skew(clean_retest_data, matching_dv_columns)
```

I'm not sure what is the right thing to do regarding the cleaning of the test data for the retest subjects. Should they be cleaned on the whole sample for the time they were collected on and then extracted (this should have an effect on outliers as well as transformations) or extracted and then cleaned. Here I am doing the latter but this seems to be resulting in more skewed variables.

```{r}
clean_retest_subs_test_data = as.data.frame(apply(retest_subs_test_data[, matching_dv_columns], 2, remove_outliers))
clean_retest_subs_test_data = transform_remove_skew(retest_subs_test_data, matching_dv_columns)
```

But what about dropping different variables from the two dataframes

Drift diffusion variables: raw vars vs. EZ vs. hddm

Mean reliabilities for different kinds of variables (e.g. drift rates and NDs; separately for EZ and HDDM; subtraction vs basic variables - single vs. multiple)
```{r}
```

--------------------------------

Can you get the factor structure for the retest
step 1:Ttest on all measures of T1 for people w and without retest
```{r}

```

step2 : efa on the people you don't have retest on and then predict how well this holds up for people w retest on t1 and on t2

Data cleaning for efa
```{r}
#read in only meaningful vars
meaningful_var_test <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Complete_01-31-2017/meaningful_variables_clean.csv')

meaningful_var_retest <- read.csv('/Users/zeynepenkavi/Documents/PoldrackLabLocal/Self_Regulation_Ontology/Data/Retest_02-11-2017/meaningful_variables_clean.csv')

#Process sub_id columns in the data dataframes
meaningful_var_retest$X <- as.character(meaningful_var_retest$X)
meaningful_var_test$X <- as.character(meaningful_var_test$X)

names(meaningful_var_retest)[1] <- 'sub_id'
names(meaningful_var_test)[1] <- 'sub_id'

#Clean and subset dfs as before
meaningful_var_retest <- meaningful_var_retest %>%
  group_by(sub_id) %>%
  do(correct_subids(.,test_worker_lookup))

#Are all retest subjects in test_data? No. 
sum(meaningful_var_retest$sub_id %in% meaningful_var_test$sub_id) == nrow(meaningful_var_retest)

#Who is missing?
meaningful_var_retest$sub_id[which(meaningful_var_retest$sub_id %in% meaningful_var_test$sub_id == FALSE)]

#Remove sub w missing test data from retest data
retest_data_mngf <- meaningful_var_retest[meaningful_var_retest$sub_id %in% meaningful_var_test$sub_id,]

#Extract test data for subs w retest data
retest_subs_test_mngf <- meaningful_var_test[meaningful_var_test$sub_id %in% meaningful_var_retest$sub_id,]

#Extract test data for subs without retest data (efa df)
rest_subs_test_mngf <- meaningful_var_test[meaningful_var_test$sub_id %in% meaningful_var_retest$sub_id == FALSE,]

#Arrange datasets of same size by sub_id
retest_data_mngf = retest_data_mngf %>% arrange(sub_id)
retest_subs_test_mngf = retest_subs_test_mngf %>% arrange(sub_id)

#CHECK IF EVERYTHING IS ORDERED RIGHT
retest_subs_test_mngf$sub_id == retest_data_mngf$sub_id
```

Checking columns for efa:
NEED TO RESOLVE SOME DIFFERENCES IN MEANINGFUL VARS BETWEEN THE TWO DATASETS
```{r}
#Check columns 
names(retest_data_mngf)[names(retest_data_mngf) %in% names(retest_subs_test_mngf) == FALSE]

names(retest_subs_test_mngf)[names(retest_subs_test_mngf) %in% names(retest_data_mngf) == FALSE]

#Subset datasets with only matching columns
all_columns_mngf <- unique(c(names(retest_data_mngf), names(retest_subs_test_mngf)))

matching_columns_mngf <- c()
for(i in 1:length(all_columns_mngf)){
  if(all_columns_mngf[i] %in% names(retest_data_mngf) & all_columns_mngf[i] %in% names(retest_subs_test_mngf)){
    matching_columns_mngf <- c(matching_columns_mngf, all_columns_mngf[i])
  }
}

retest_data_mngf <- retest_data_mngf[,matching_columns_mngf]
retest_subs_test_mngf <- retest_subs_test_mngf[,matching_columns_mngf]
rest_subs_test_mngf <- rest_subs_test_mngf[,matching_columns_mngf]

#standardize all vars going in to factor analysis
rest_subs_test_mngf_z <- rest_subs_test_mngf

for(i in 1:length(rest_subs_test_mngf)){
  if(names(rest_subs_test_mngf)[i] == 'sub_id'){
    rest_subs_test_mngf_z$sub_id = rest_subs_test_mngf$sub_id
  }
  else{
    rest_subs_test_mngf_z[,names(rest_subs_test_mngf)[i]] = scale(rest_subs_test_mngf[,i])
  }
}

rest_subs_test_mngf_z <- as.data.frame(rest_subs_test_mngf_z)
```


Efa on subjects from original dataset withOUT retest data (using meaningful vars only)
```{r}
unrotated <- efaUnrotate(rest_subs_test_mngf_z, nf=3, varList=names(rest_subs_test_mngf_z)[-1], estimator="mlr")

summary(unrotated, std=TRUE)
inspect(unrotated, "std")
```


```{r}
corMat <- rest_subs_test_mngf_z %>% select(-sub_id) %>% cor(.,use = 'complete.obs')

solution <- fa(r = corMat, nfactors = 3, rotate = "oblimin", fm = "pa")
```

```{r}
solution
```

 
Completion times/days
 
 - First data release email esp to ASU
 - Send the spreadsheet with meaningul variables + demographics
 
 
T1 comparison for people who came back vs didn't (of all that are invited)
